In [1]:
import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import socket
from matplotlib import pylab
import sys
import yaml
import numbers
import os
import plotly
import matplotlib.pyplot as plt
import scvelo as scv
from tqdm.notebook import tqdm
import itertools
import random
import seaborn as sns

import rpy2
import rpy2.robjects as ro

warnings.filterwarnings('ignore')
In [2]:
%load_ext rpy2.ipython
In [3]:
%matplotlib inline
In [4]:
sc.settings.verbosity = 3         # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (10, 10)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
In [5]:
plotly.offline.init_notebook_mode()
In [6]:
%matplotlib inline
In [7]:
sc.settings.verbosity = 3         # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (9, 9)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5

Configure paths¶

In [8]:
hostRoot = "-".join(socket.gethostname().split('-')[0:2])

with open(os.path.expanduser('~')+"/paths_config.yaml", 'r') as f:
    paths = yaml.load(f, Loader=yaml.FullLoader)

#indir=paths["paths"]["indir"][hostRoot]
outdir="./outdir/"
FinaLeaf="/Cycling"
#projectBaseDir=paths["paths"]["projectBaseDir"][hostRoot]

Cells selection and pBulking¶

In [9]:
adata = sc.read_h5ad(outdir+FinaLeaf+"/4C_Cycling_DA.h5ad")
adataraw = sc.read_h5ad(outdir+"/1_polaroid_mint.h5ad")[adata.obs_names]
adataraw.obs = adata.obs
adata = adataraw
In [10]:
sc.pp.normalize_total(adata)
normalizing counts per cell
    finished (0:00:00)
In [11]:
adata.obs
Out[11]:
dataset organoid region type type_region regionContrast n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts ... S_score G2M_score phase Progenitor subLeiden umap_density_type umap_density_region umap_density_organoid DiffAbundant leiden_partition_final
AAACCCACAAGTTGGG-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 3851 8.256348 10479.0 9.257224 ... -0.032057 0.233361 G2M CyclingProgenitor 5 0.710058 0.558309 0.424233 0 not_enriched
AAAGGTAGTAGACGTG-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 2554 7.845808 5354.0 8.585786 ... 0.050134 -0.036817 S CyclingProgenitor 0 0.885485 0.718089 0.768943 0 not_enriched
AAGACAATCCATTTGT-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 1602 7.379632 3680.0 8.210939 ... -0.049984 0.937179 G2M CyclingProgenitor 4 0.370772 0.378476 0.268527 0 not_enriched
AAGGAATTCGTAGTCA-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 3390 8.128880 10772.0 9.284799 ... -0.026137 1.362167 G2M CyclingProgenitor 5 0.621688 0.447619 0.514371 0 not_enriched
AAGTTCGAGTGGTGAC-1_control3_piece2 control3_piece2 control3 piece2 control control_piece2 control 781 6.661855 1077.0 6.982863 ... -0.098444 0.007369 G2M CyclingProgenitor 0 0.555372 0.603375 0.682788 0 not_enriched
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTAGTCGTATGAGAT-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 3403 8.132706 9471.0 9.156096 ... -0.056564 1.110335 G2M CyclingProgenitor 4 0.415113 0.464044 0.083760 1 enriched
TTTAGTCGTCACTTAG-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 3365 8.121480 8708.0 9.072112 ... -0.129154 0.112132 G2M CyclingProgenitor 9 0.573504 0.613595 0.464877 1 enriched
TTTCCTCAGTACCGGA-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 3302 8.102586 7120.0 8.870804 ... 0.115321 0.010989 S CyclingProgenitor 2 0.758200 0.755870 0.805026 1 enriched
TTTGATCGTGCTATTG-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 5595 8.629807 19878.0 9.897419 ... 0.325876 0.413033 G2M CyclingProgenitor 6 0.597012 0.475544 0.350312 0 not_enriched
TTTGGAGCACAGTGTT-1_polaroid3_proximal polaroid3_proximal polaroid3 proximal polaroid polaroid_proximal proximal 3245 8.085179 10068.0 9.217216 ... 0.185670 -0.176771 S CyclingProgenitor 8 0.148207 0.091465 0.102741 0 not_enriched

2456 rows × 30 columns

In [12]:
adata.write_h5ad(outdir+FinaLeaf+"/5C_Cycling_preBulk.h5ad")

by region¶

In [13]:
groupingCovariate = "region"
PseudooReplicates_per_group = 10
print("Pbulking with target of "+str(PseudooReplicates_per_group)+" PRs will result in following cells per PR")
adata.obs.groupby(groupingCovariate).size() / PseudooReplicates_per_group
Pbulking with target of 10 PRs will result in following cells per PR
Out[13]:
region
distal       34.2
medial       14.5
piece1       20.5
piece2       19.6
piece3       17.4
proximal    139.4
dtype: float64
In [14]:
total = pd.DataFrame(index = adata.var_names)
total_metadata = pd.DataFrame(index= ["_".join(Rep) for Rep in  list(itertools.product(adata.obs[groupingCovariate].cat.categories.tolist(), [str(r)  for r in list(range(PseudooReplicates_per_group))]))])
for group in adata.obs[groupingCovariate].cat.categories:
    groupAdata = adata[adata.obs[groupingCovariate] == group]
    
    group_cells = groupAdata.obs_names.tolist()
    random.Random(4).shuffle(group_cells)
    
    metaCellslist=[group_cells[i::PseudooReplicates_per_group] for i in range(PseudooReplicates_per_group)]
    
    for partition in enumerate(metaCellslist):
        
        total['{}_{}'.format(group, partition[0])] = adata[partition[1]].X.sum(axis = 0).A1
    
        total_metadata.loc['{}_{}'.format(group, partition[0]),"group"] = group
        total_metadata.loc['{}_{}'.format(group, partition[0]),"pseudoreplicate"] = partition[0]
        total_metadata.loc['{}_{}'.format(group, partition[0]),"number_of_cell"] = int(len(partition[1]))
    
    #With this line we propagate - whenever possible -  obs to aggregated pReplicate
    for obsMD in [obsMD for obsMD in groupAdata.obs.columns.tolist() if len(groupAdata.obs[obsMD].unique()) == 1 and obsMD != groupingCovariate]:
        total_metadata.loc[["_".join(l) for l in list(itertools.product([group],[str(r)  for r in list(range(PseudooReplicates_per_group))]))], obsMD ] = adata.obs.loc[adata.obs[groupingCovariate] == group,obsMD][0]
        
        
total_metadata = total_metadata.dropna(axis = 1)



adata_bulk = sc.AnnData(total.transpose()).copy()
adata_bulk.var = adata.var.copy()
adata_bulk.obs = pd.concat([adata_bulk.obs, total_metadata], axis = 1)

adata_bulk.obs["group"] =adata_bulk.obs["group"].astype("category")
with open("./colorMap.yaml", 'r') as f:
    colorMap = yaml.load(f, Loader=yaml.FullLoader)["uns_colors"]
colorMap
adata_bulk.uns["group_colors"] = [colorMap[i]["color"] for i in adata_bulk.obs["group"].cat.categories]

adata_bulk.obs["regionContrast"] = adata_bulk.obs["regionContrast"].astype("category")
adata_bulk.uns["regionContrast_colors"] = [colorMap[i]["color"] for i in adata_bulk.obs["regionContrast"].cat.categories]



adata_bulk.write_h5ad(outdir+FinaLeaf+"/5C_Cycling_pBulk.bySegment."+str(PseudooReplicates_per_group)+"PRs.h5ad")
total.to_csv(outdir+FinaLeaf+"/5C_Cycling_pBulk.bySegment."+str(PseudooReplicates_per_group)+"PRs.tsv", sep="\t")
... storing 'type' as categorical
... storing 'type_region' as categorical
... storing 'is.Stressed' as categorical
... storing 'Progenitor' as categorical
In [ ]:

In [ ]:

In [ ]: